package top.rushpeak.edu03.crawler.lottery;

import java.util.Locale;
import java.util.regex.Pattern;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.url.WebURL;

/**
 * Crawler that visits pages under {@link #TOURL} and prints rows found in the
 * table body whose element id is {@code jq_body_kc_result}.
 *
 * <p>For each visited HTML page, up to {@code MAX_RESULTS} rows that have
 * exactly {@code RESULT_COLUMNS} cells are written to standard output: the
 * text of the first cell followed by the text of the third cell.
 */
public class LotteryCrawler extends WebCrawler {

	/** URL prefix a link must start with in order to be visited. */
	public final static String TOURL = "http://kaijiang.aicai.com/cqssc/";

	// Skip URLs that end in one of these file extensions.
	private final static Pattern FILTERS = Pattern.compile(".*(\\.(css|js|gif|jpg"
			+ "|png|mp3|mp4|zip|gz))$");

	/** Maximum number of matching rows printed per visited page. */
	private static final int MAX_RESULTS = 10;

	/** Number of cells a row must have to be treated as a result row. */
	private static final int RESULT_COLUMNS = 3;

	/**
	 * Decides whether a discovered URL should be fetched.
	 *
	 * @param referringPage the page on which the URL was found (not used)
	 * @param url the candidate URL
	 * @return {@code true} when the lower-cased URL starts with {@link #TOURL}
	 *         and does not end in one of the filtered file extensions
	 */
	@Override
	public boolean shouldVisit(Page referringPage, WebURL url) {
		// Locale.ROOT keeps the lowering independent of the JVM's default locale,
		// so the prefix comparison behaves the same everywhere.
		String href = url.getURL().toLowerCase(Locale.ROOT);
		if (FILTERS.matcher(href).matches()) {
			return false;
		}
		return href.startsWith(TOURL);
	}

	/**
	 * Parses a fetched page and prints the result rows it contains.
	 *
	 * <p>Pages whose parse data is not HTML are only logged. When the expected
	 * table body or its rows are missing, a message is printed and the method
	 * returns early.
	 *
	 * @param page the fetched page
	 */
	@Override
	public void visit(Page page) {
		logger.info("开始解析信息");

		if (page.getParseData() instanceof HtmlParseData) {
			HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
			Document doc = Jsoup.parse(htmlParseData.getHtml());

			Element tbody = doc.getElementById("jq_body_kc_result");
			if (tbody == null) {
				System.out.println("无可用tbody");
				return;
			}

			// getElementsByTag never returns null; an empty result is the
			// "no rows" condition this check is meant to catch.
			Elements trs = tbody.getElementsByTag("tr");
			if (trs.isEmpty()) {
				System.out.println("无可用trs");
				return;
			}

			int printed = 0;
			for (Element tr : trs) {
				if (printed >= MAX_RESULTS) {
					break;
				}
				Elements tds = tr.getElementsByTag("td");
				// Rows with any other cell count are skipped and do not count
				// towards the limit.
				if (tds.size() != RESULT_COLUMNS) {
					continue;
				}
				String id = tds.get(0).text();
				String luckyNums = tds.get(2).text();
				System.out.println(id + "中奖号码：" + luckyNums);
				printed++;
			}
		}
		logger.info("结束解析信息");
	}

}
